# Run tools/preprocess_data.py on the Baichuan-7B training parquet file using
# the HuggingFace-style tokenizer directory given below.
#
# The script path is relative, so invoke this from the directory that contains
# ./tools (presumably the repository root -- confirm for your checkout).
#
# Every location can be overridden from the environment; the defaults are the
# values that were previously hard-coded, so running with no overrides produces
# the same command line as before:
#   BAICHUAN_DATASET_DIR    directory holding the input parquet and the outputs
#   BAICHUAN_INPUT          input parquet file
#   BAICHUAN_TOKENIZER_DIR  value passed as --tokenizer-name-or-path
#   BAICHUAN_OUTPUT_PREFIX  value passed as --output-prefix
#   PREPROCESS_WORKERS      value passed as --workers
dataset_dir=${BAICHUAN_DATASET_DIR:-/root/zhanggj/dataset_baichuan7B}
input_file=${BAICHUAN_INPUT:-${dataset_dir}/train-00000-of-00001-a09b74b3ef9c3b56.parquet}
tokenizer_dir=${BAICHUAN_TOKENIZER_DIR:-/root/zhanggj/baichuan-7B-hf}
output_prefix=${BAICHUAN_OUTPUT_PREFIX:-${dataset_dir}/alpaca}
workers=${PREPROCESS_WORKERS:-4}

# Expansions are quoted so overridden paths containing spaces stay one argument.
python ./tools/preprocess_data.py \
    --input "${input_file}" \
    --tokenizer-name-or-path "${tokenizer_dir}" \
    --output-prefix "${output_prefix}" \
    --workers "${workers}" \
    --log-interval 1000 \
    --tokenizer-type PretrainedFromHF
